from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
# Sentiment tokenizer and model; the checkpoint name can be changed to test alternative models
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
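Any three-class sentiment checkpoint from the Hugging Face Hub should drop in here; the 0/0.5/1 score mapping used later assumes three labels ordered negative/neutral/positive. A hedged example using the same authors' newer checkpoint (commented out so the defaults above stay active):
# tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
# model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')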
These settings depend on which board you want to scrape, how much of it to scrape, the title for your output CSVs, etc. Examples are filled in.
board_url = the URL of the board you would like to scrape
pages = how many pages of the board you'd like to scrape (check the website for the total number of pages if you'd like to do a full scrape)
project_title = defines the title for the CSVs output by the script
board_url = 'https://forums.studentdoctor.net/forums/ob-gyn.39/'
pages = 60
project_title = 'SDNOBGYN'
Running this will output 6 CSVs:
[title]_raw_data.csv: post timestamp + content
[title]_data_sentiment.csv: post timestamp + content + sentiment analysis
[title]_daily_average.csv: date + daily sentiment average
[title]_weekly_average.csv: date + weekly sentiment average
[title]_monthly_average.csv: month + monthly sentiment average
[title]_yearly_average.csv: year + annual sentiment average
These CSVs may contain a few posts with outlier timestamps, caused by stickied posts at the top of the first page; those rows will need to be cleaned out unless you are running a full scrape of the board from inception to the present day.
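A minimal cleanup sketch for that case, assuming a hypothetical cutoff date before which no genuine posts exist (adjust per board); it is commented out since the raw CSV is only written further down:
# cleaned = pd.read_csv(project_title + '_raw_data.csv', parse_dates=['Date'])
# cutoff = pd.Timestamp('2005-01-01', tz='UTC')  # hypothetical cutoff date; set per board
# cleaned = cleaned[cleaned['Date'] >= cutoff]
# cleaned.to_csv(project_title + '_raw_data_cleaned.csv', index=False)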
base_url = 'https://forums.studentdoctor.net/'
forum_url_template = f"{board_url}page-{{}}"
full_post_urls = []
for page_number in range(1, pages + 1):  # range(1, pages) would skip the final page
    page_url = forum_url_template.format(page_number)  # avoid shadowing board_url
    board_html = requests.get(page_url)
    board_soup = BeautifulSoup(board_html.text, 'html.parser')
    # Each thread title on a board page links to its full thread
    posts = board_soup.find_all('div', attrs={'class': 'structItem-title'})
    for post in posts:
        link = post.find('a')['href']
        full_link = base_url + link
        full_post_urls.append(full_link)
def extract_forum_data(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # Timestamps come from <time> tags; post bodies live in div.bbWrapper
    result_date = [time['datetime'] for time in soup.find_all('time')]
    result_text = [div.get_text(separator=' ', strip=True) for div in soup.find_all('div', attrs={'class': 'bbWrapper'})]
    return list(zip(result_date, result_text))
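Before crawling every thread, it can be worth spot-checking the helper on a single URL (optional, hence commented out):
# print(extract_forum_data(full_post_urls[0])[:2])  # first two (timestamp, text) pairs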
all_data = []
for url in full_post_urls:
    all_data.extend(extract_forum_data(url))
df = pd.DataFrame(all_data, columns=['Date','Content'])
df['Date'] = pd.to_datetime(df['Date'], utc=True)
df.set_index('Date', inplace=True)
df.to_csv(project_title+'_raw_data.csv', index=True)
def sentiment_score(review):
    # Truncate at the tokenizer level so inputs never exceed the model's 512-token limit
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    result = model(tokens)
    # argmax yields 0 (negative), 1 (neutral), or 2 (positive); dividing by 2 maps to 0 / 0.5 / 1
    return int(torch.argmax(result.logits)) / 2
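A quick, hedged sanity check (exact outputs depend on the model weights, but positive text should score near 1 and negative near 0):
# print(sentiment_score('Matching here was the best decision I ever made'))  # expect ~1.0
# print(sentiment_score('This rotation was a miserable experience'))  # expect ~0.0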
df['Sentiment'] = df['Content'].apply(sentiment_score)  # truncation is handled by the tokenizer above
df.to_csv(project_title+'_data_sentiment.csv', index=True)
def resample_data(df, freq):
    # Note: pandas 2.2+ prefers the 'ME'/'YE' aliases over 'M'/'Y' for month/year-end
    return df['Sentiment'].resample(freq).mean()
daily_avg = resample_data(df, 'D')
daily_avg.index = daily_avg.index.strftime('%Y-%m-%d')
weekly_avg = resample_data(df, 'W')
weekly_avg.index = weekly_avg.index.strftime('%Y-%m-%d')
monthly_avg = resample_data(df, 'M')
monthly_avg.index = monthly_avg.index.strftime('%Y-%m')
yearly_avg = resample_data(df, 'Y')
yearly_avg.index = yearly_avg.index.strftime('%Y')
daily_avg.to_csv(project_title+'_daily_average.csv', index=True)
weekly_avg.to_csv(project_title+'_weekly_average.csv', index=True)
monthly_avg.to_csv(project_title+'_monthly_average.csv', index=True)
yearly_avg.to_csv(project_title+'_yearly_average.csv', index=True)
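The resampled averages can then be plotted with the matplotlib import above; a minimal sketch for the monthly series:
monthly_avg.plot(title=project_title + ' monthly sentiment average')
plt.xlabel('Month')
plt.ylabel('Mean sentiment (0 = negative, 1 = positive)')
plt.tight_layout()
plt.show()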